library('RSpectra')
library("TopicScore")
library('tidyverse')
library('trimcluster')
library('extremefit')
library('igraph')
library('R.matlab')



#' SCORE+ style spectral clustering of a graph.
#'
#' Builds a ridge-regularised, degree-normalised version of `A`, computes its
#' leading eigenvectors with `RSpectra::eigs()`, reweights them by their
#' eigenvalues, divides every remaining eigenvector entry-wise by the first
#' one and runs k-means on the rows of the resulting ratio matrix.
#'
#' @param A Square matrix-like object (anything supporting `dim()`,
#'   `rowSums()` and `t()`); used as the adjacency matrix of the graph.
#' @param k Number of clusters requested from `kmeans()`.
#' @param c Multiplier applied to the maximum row sum of `A` to obtain the
#'   ridge term `delta`.
#' @param r Number of eigenvectors to compute. When `NULL` (default),
#'   `k + 1` eigenvectors are computed and the last ratio column is dropped
#'   if the relative gap between eigenvalues `k` and `k + 1` exceeds 0.1.
#' @return A list with components `labels` (k-means cluster assignment),
#'   `ratios` (the clustered ratio matrix), `delta` (ridge term), `eig.vec`
#'   and `eig.val` (unweighted eigenvectors / eigenvalues from
#'   `RSpectra::eigs()`).
SCOREplus <- function(A, k, c = 0.1, r = NULL) {
  # Fail early with a clear message; the eigen-solver needs a square input.
  if (length(dim(A)) != 2L || nrow(A) != ncol(A)) {
    stop("`A` must be a square matrix.", call. = FALSE)
  }

  # If r is not given, compute k + 1 eigenvectors and let the eigen-gap
  # decide how many are kept; otherwise the latent dimension is fixed.
  fix_latent_dim <- !is.null(r)
  if (!fix_latent_dim) {
    r <- k + 1
  }

  degrees <- rowSums(A)
  delta <- c * max(degrees) # tuning parameter for the graph Laplacian
  d_inv <- 1 / sqrt(delta + degrees)

  # Graph Laplacian with ridge regularization: entry (j, i) equals
  # A[i, j] * d_inv[i] * d_inv[j].
  L_delta <- t(d_inv * A) * d_inv

  # Top r eigenpairs.
  eig_out <- RSpectra::eigs(L_delta, k = r)

  # Reweight each eigenvector (column) by its eigenvalue. sweep() scales the
  # columns directly, without materialising a diagonal matrix and without
  # the diag(<length-1 vector>) pitfall.
  eig_vec_w <- sweep(eig_out$vectors, 2, eig_out$values, "*")

  # Entry-wise ratios of eigenvectors 2..r against the first eigenvector.
  ratios <- eig_vec_w[, 2:r] / eig_vec_w[, 1]

  if (!fix_latent_dim) {
    # Decide the latent dimension by the eigen-gap: a large relative drop
    # from eigenvalue k to k + 1 means the (k + 1)-th vector is discarded.
    # Re() is applied because the solver may return complex eigenvalues.
    signal_weakness <- 1 - eig_out$values[k + 1] / eig_out$values[k]
    if (Re(signal_weakness) > 0.1) {
      ratios <- ratios[, seq_len(k - 1)]
    }
  }

  # k-means on the ratio rows (many restarts, since k-means is
  # initialisation-dependent).
  labels <- kmeans(ratios, k, nstart = 100, iter.max = 100)$cluster

  list(
    labels = labels,
    ratios = ratios,
    delta = delta,
    eig.vec = eig_out$vectors,
    eig.val = eig_out$values
  )
}


# Uncentred correlation (cosine similarity) between x and y.
# Returns 0 whenever the inner product is exactly zero, which also covers
# all-zero inputs that would otherwise produce 0 / 0.
cor2 <- function(x, y) {
  inner <- sum(x * y)
  if (inner == 0) {
    return(0)
  }
  norm_x <- sqrt(sum(x * x))
  norm_y <- sqrt(sum(y * y))
  inner / norm_x / norm_y
}


# ---- Load the three Citeseer tables from the working directory ----
D_read <- read.csv("citeseerD.csv")
Pi_read <- read.csv("citeseerPi.csv")
X_read <- read.csv("citeseerX.csv")

# The first column of every table is discarded before conversion
# (presumably a row-index column written on export -- confirm).
D_tot <- as.matrix(D_read[, -1])   # later passed to SCOREplus() as the graph
Pi_tot <- as.matrix(Pi_read[, -1]) # one column per class
X_tot <- as.matrix(X_read[, -1])   # node covariates

n_tot <- nrow(D_tot) # number of nodes
k <- ncol(Pi_tot)    # number of classes

# ---- Covariate-based similarity ----
# Rows of X are scaled to sum to one, their pairwise inner products are
# taken, and the diagonal (self-similarity) is subtracted out.
X_f <- X_tot / rowSums(X_tot)
X_D <- X_f %*% t(X_f)
X_Dn <- X_D - diag(diag(X_D))

# ---- Augment the network with the similarity (weight 10) ----
# D_copy keeps the unmodified matrix that was read from disk.
D_copy <- D_tot
D_tot <- D_copy + 10 * X_Dn


# ---- Cross-validation configuration ----
set.seed(123) # makes the fold permutation below (and later draws) repeatable
fold_num <- 10
fold_size <- n_tot %/% fold_num # the last fold absorbs the remainder later
semi_ratio <- 0.036
ratio_len <- length(semi_ratio)

# Result holders: one row per fold, one column per entry of semi_ratio,
# all initialised to zero.
res_AngleMinPlus <- matrix(0, nrow = fold_num, ncol = ratio_len)
time_AngleMinPlus <- matrix(0, nrow = fold_num, ncol = ratio_len)

# NOTE(review): `cur_time` is not read anywhere in this file (the loop times
# itself with `cur.time`); kept only so the set of globals is unchanged.
cur_time <- 0

# Random ordering of all node indices; folds are contiguous slices of it.
perm_tot <- sample.int(n_tot, n_tot)


# ---- Cross-validated evaluation of the AngleMin+ classifier ----
# For every fold: hold out the test nodes, treat the first n_L shuffled
# training nodes as labelled, cluster the remaining training nodes with
# SCOREplus(), build one reference vector per class from the labelled nodes
# and assign each test node to the class whose reference vector has the
# largest cosine similarity (cor2) with the node's own vector.

# Weight of the covariate-based columns relative to the network-based ones.
coeff_temp <- 0.4

for (ii in seq_len(fold_num)) {
  # Fold ii is a contiguous slice of perm_tot; the last fold also takes the
  # nodes left over by the integer division that defined fold_size.
  test_ind_s <- (ii - 1) * fold_size + 1
  test_ind_t <- if (ii == fold_num) n_tot else ii * fold_size
  test_pos <- test_ind_s:test_ind_t
  train_ind <- perm_tot[-test_pos]
  test_ind <- perm_tot[test_pos]

  # Reshuffle the training nodes; the first n_L of them act as labelled.
  perm_list_train <- sample(train_ind, length(train_ind))
  n <- length(train_ind)
  n_test <- length(test_ind)

  for (ratio_num in seq_len(ratio_len)) {
    n_L <- 120 # 0.036 * n_tot (hard-coded; not derived from semi_ratio)
    n_U <- n - n_L
    # The index ranges below are only meaningful with at least one
    # unlabelled training node.
    stopifnot(n_L >= 1, n_U >= 1)

    labeled_idx <- perm_list_train[seq_len(n_L)]
    unlabeled_idx <- perm_list_train[(n_L + 1):n]

    cur.time <- Sys.time()

    # Class-by-covariate totals over the labelled nodes and their norm.
    X_amin <- t(Pi_tot[labeled_idx, ]) %*% X_tot[labeled_idx, ]
    scale_X <- sqrt(sum(X_amin^2))

    # Cluster the unlabelled training nodes on their induced subgraph.
    temp_res <- SCOREplus(D_tot[unlabeled_idx, unlabeled_idx], k)
    clu_res_label <- temp_res$labels

    # Membership matrix with 2k columns: columns 1..k hold the known labels
    # of the labelled nodes, columns k+1..2k hold the estimated cluster of
    # each unlabelled training node. Rows of test nodes stay zero.
    Pi_est_w2 <- matrix(0, nrow = n_tot, ncol = 2 * k)
    Pi_est_w2[labeled_idx, 1:k] <- Pi_tot[labeled_idx, ]
    Pi_est_w2[cbind(unlabeled_idx, k + clu_res_label)] <- 1

    # One reference row per class: network part and covariate part, each
    # scaled by its own Frobenius norm before being concatenated.
    V_AngleMinPlus0 <- t(Pi_tot[labeled_idx, ]) %*% D_tot[labeled_idx, ] %*%
      Pi_est_w2
    scale_AngleMinPlus <- sqrt(sum(V_AngleMinPlus0^2))
    V_AngleMinPlus <- cbind(
      V_AngleMinPlus0 / scale_AngleMinPlus,
      coeff_temp * X_amin / scale_X
    )

    err_AngleMinPlus <- rep(0, n_tot)
    for (i in test_ind) {
      # Same construction as the reference rows, for a single test node.
      tempwi2 <- cbind(
        matrix(D_tot[i, ] %*% Pi_est_w2, nrow = 1) / scale_AngleMinPlus,
        coeff_temp * matrix(X_tot[i, ] / scale_X, nrow = 1)
      )
      if (sum(tempwi2^2) == 0) {
        # No information about this node: score it like a uniform random
        # guess, i.e. wrong with probability 1 - 1 / k.
        err_AngleMinPlus[i] <- rbinom(n = 1, size = 1, prob = 1 - 1 / k)
      } else {
        temp <- vapply(
          seq_len(k),
          function(l) cor2(tempwi2, V_AngleMinPlus[l, ]),
          numeric(1)
        )
        err_AngleMinPlus[i] <- 1 - Pi_tot[i, which.max(temp)]
      }
    }

    res_AngleMinPlus[ii, ratio_num] <- sum(err_AngleMinPlus[test_ind]) / n_test

    # Store elapsed time in seconds explicitly: `Sys.time() - cur.time`
    # picks its unit automatically (secs, mins, ...) and that unit is lost
    # when the value is written into a plain numeric matrix.
    elapsed_secs <- as.numeric(difftime(Sys.time(), cur.time, units = "secs"))
    time_AngleMinPlus[ii, ratio_num] <-
      time_AngleMinPlus[ii, ratio_num] + elapsed_secs
  }
}

# Mean test accuracy over all folds (res_AngleMinPlus stores error rates).
# The original referenced `res_lswn2`, which is never defined in this script.
print(mean(1 - res_AngleMinPlus))

